Exercice 1:

Question 1.

A l’aide des fonctions sample, rnorm et runif, on génère un échantillon de taille n=1000, en considérant ici θ=(μ,σ,π) avec μ=2, σ=1, π=0.9.

# Simulate a contaminated-normal sample: with probability pi an observation
# comes from N(mu, sig^2); otherwise it is an "outlier" from U(-a, a).
pi <- 0.90
mu <- 2
sig <- 1
a <- 10
c <- 1/(2*a)  # density of the U(-a, a) component (read globally by loglik)
n <- 1000

# Vectorised generation (replaces the scalar for-loop): draw all n component
# labels in one sample() call, then pick each x from the matching component.
# ifelse() selects elementwise between the two full random vectors.
y <- sample(c(1, 0), size = n, replace = TRUE, prob = c(pi, 1 - pi))
x <- ifelse(y == 1, rnorm(n, mean = mu, sd = sig), runif(n, min = -a, max = a))

boxplot(x)

dotchart(x)

Question 2.

L’objectif de cette question est de coder un algorithme EM pour ce problème. Pour cela, on pourra s’aider du document “EM calculs” sur Moodle ainsi que des slides 27–32 du cours.

# Observed-data log-likelihood of the contaminated-normal mixture.
#
# theta : parameter vector c(mu, sigma, pi)
# x     : numeric vector of observations
# c     : density of the uniform outlier component, i.e. 1/(2*a).
#         Defaults to 1/(2*10) = 0.05, matching a = 10 used in this script.
#         The previous version read `c` from the global environment, which
#         silently breaks when `a` changes; pass it explicitly instead.
#
# Returns the scalar log-likelihood sum(log(pi*phi(x) + (1-pi)*c)).
loglik <- function(theta, x, c = 1/(2*10)){
    phi <- dnorm(x, mean = theta[1], sd = theta[2])
    logL <- sum(log(theta[3]*phi + (1 - theta[3])*c))
    return(logL)
}

# EM algorithm for the contaminated-normal mixture
#   X ~ pi * N(mu, sigma^2) + (1 - pi) * U(-a, a).
#
# x      : numeric vector of observations
# theta0 : initial parameter vector c(mu0, sigma0, pi0)
# a      : half-width of the uniform outlier component
# epsi   : convergence threshold on the log-likelihood increase
#
# Returns a list with the final log-likelihood, the estimated theta, and the
# posterior probabilities y[i] = P(observation i belongs to the normal part).
em_outlier <- function(x, theta0, a, epsi){
    c <- 1/(2*a)      # density of the U(-a, a) component
    n <- length(x)
    # Observed-data log-likelihood computed LOCALLY so the uniform density
    # always matches the `a` passed to this function. (The global loglik()
    # resolved `c` in the global environment, which could be out of sync
    # with this function's `a` argument.)
    obs_loglik <- function(theta){
        phi <- dnorm(x, mean = theta[1], sd = theta[2])
        sum(log(theta[3]*phi + (1 - theta[3])*c))
    }
    go_on <- TRUE
    logL0 <- obs_loglik(theta0)
    t <- 0
    print(c(t, logL0))
    while (go_on){
        t <- t + 1
        # E-step: posterior probability that each point is from the normal component
        phi <- dnorm(x, mean = theta0[1], sd = theta0[2])
        y <- phi*theta0[3] / (phi*theta0[3] + c*(1 - theta0[3]))
        # M-step: weighted updates of pi, mu, sigma
        S <- sum(y)
        pi <- S/n
        mu <- sum(x*y)/S
        sig <- sqrt(sum(y*(x - mu)^2)/S)
        theta <- c(mu, sig, pi)
        logL <- obs_loglik(theta)
        # EM increases the log-likelihood monotonically, so a small increase
        # signals convergence.
        if (logL - logL0 < epsi){
            go_on <- FALSE
        }
        logL0 <- logL
        theta0 <- theta
        print(c(t, logL))
    }
    return(list(loglik = logL, theta = theta, y = y))
}

Question 3.

On va maintenant appliquer cet algorithme sur les données avec différentes initialisations.

# Initialise the EM algorithm at the moment estimates of the whole sample
# (mean and sd of x), with an uninformative mixing proportion of 0.5.
mu0<-mean(x) 
sig0<-sd(x)
pi0<-0.5
epsi<-1e-8
theta0<-c(mu0,sig0,pi0)
estim<-em_outlier(x,theta0,a,epsi)
## [1]     0.000 -2250.246
## [1]     1.000 -1789.902
## [1]     2.000 -1721.572
## [1]     3.000 -1714.053
## [1]     4.000 -1712.578
## [1]     5.000 -1712.265
## [1]     6.000 -1712.201
## [1]     7.000 -1712.188
## [1]     8.000 -1712.186
## [1]     9.000 -1712.185
## [1]    10.000 -1712.185
## [1]    11.000 -1712.185
## [1]    12.000 -1712.185
## [1]    13.000 -1712.185
## [1]    14.000 -1712.185
## [1]    15.000 -1712.185
## [1]    16.000 -1712.185
# 1 - estim$y is the posterior probability that each observation is an
# outlier (uniform component); extreme x values should plot near 1.
plot(x,1-estim$y)

Question 4.

# Direct maximisation of the observed-data log-likelihood with optim():
# L-BFGS-B handles the box constraints on (mu, sigma, pi), and
# fnscale = -1 turns optim's minimiser into a maximiser.
# NOTE(review): loglik resolves the uniform density `c` outside its own
# scope here — confirm the global `c` equals 1/(2*a) before running.
opt <- optim(theta0, loglik, method="L-BFGS-B", control=list(fnscale=-1), lower = c(-10, 0, 0.01), upper = c(10, 10, .99), x=x)

# Both approaches should agree to several decimal places.
print(opt$par)     # estimates computed with optim
## [1] 1.9781772 0.9872289 0.9187138
print(estim$theta)  # estimates computed with EM
## [1] 1.9781778 0.9872292 0.9187172

Exercice 2:

Question 1.

Wine data

library(mclust)
## Package 'mclust' version 6.0.0
## Type 'citation("mclust")' for citing this R package in publications.
# Column 1 of wine.txt is the known class label; the remaining 13 columns
# are the measurements used as clustering inputs.
data<-read.table('wine.txt',header=FALSE)
x<-data[,2:14]
# Pairs plots of the inputs, coloured/marked by the true class.
plot(x[,1:7],col=data[,1],pch=data[,1])

plot(x[,8:13],col=data[,1],pch=data[,1])

# Fit Gaussian mixture models; Mclust selects the number of components and
# the covariance structure by BIC.
wine <- Mclust(x)
summary(wine)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 3 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -3015.333 178 158 -6849.387 -6850.694
## 
## Clustering table:
##  1  2  3 
## 59 69 50
plot(wine,what="BIC")

plot(wine,what="classification",dimens=1:7)

plot(wine,what="uncertainty",dimens=1:7)

# Confusion table: true classes (rows) vs. estimated clusters (columns).
table(data[,1],wine$classification)
##    
##      1  2  3
##   1 59  0  0
##   2  0 69  2
##   3  0  0 48
# Agreement between clustering and true labels (1 = perfect agreement).
adjustedRandIndex(data[,1],wine$classification)
## [1] 0.9667411

E.coli data

data<-read.table('ecoli.txt',header=FALSE)
# Keep only the numeric input columns; column 9 (dropped here) holds the
# true class used below. NOTE(review): confirm what columns 1, 4, 5 and 8
# contain — presumably the sequence name and uninformative attributes.
x<-data[,-c(1,4,5,8,9)]
# Real classes
class.new = as.integer(as.factor(data[,9]))

plot(x,col=class.new)

# Model-based clustering with BIC model selection.
ecoli <- Mclust(x)
summary(ecoli)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EVI (diagonal, equal volume, varying shape) model with 3 components: 
## 
##  log-likelihood   n df      BIC     ICL
##        811.4761 336 24 1483.341 1464.26
## 
## Clustering table:
##   1   2   3 
## 145  78 113
# True classes (rows, 8 of them) vs. the 3 estimated clusters (columns):
# several true classes are merged into single clusters.
table(class.new,ecoli$classification)
##          
## class.new   1   2   3
##         1 139   4   0
##         2   2   4  71
##         3   0   1   1
##         4   0   0   2
##         5   0   1  34
##         6   0  20   0
##         7   0   4   1
##         8   4  44   4
plot(ecoli,what="classification")

# ARI is moderate: 3 clusters cannot match 8 true classes.
adjustedRandIndex(class.new,ecoli$classification)
## [1] 0.6968981

Seeds

data<-read.table('seeds.txt',header=FALSE)
# Columns 1-7 are the geometric measurements of the kernels; column 8
# (used later) is the true variety label.
x<-data[,1:7]
# Model-based clustering on the raw inputs; BIC picks 4 components here.
seeds <- Mclust(x)
summary(seeds)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EEV (ellipsoidal, equal volume and shape) model with 4 components: 
## 
##  log-likelihood   n  df     BIC      ICL
##        1323.903 210 122 1995.46 1991.099
## 
## Clustering table:
##  1  2  3  4 
## 59 56 53 42
plot(seeds)

Some input variables are strongly correlated; it may be preferable to extract principal components first.

# PCA on the correlation matrix (cor=TRUE standardises the inputs), then
# cluster the first 5 principal component scores instead of the raw data.
pca<-princomp(x,cor=TRUE)
z<-pca$scores[,1:5]
seeds1 <- Mclust(z)
summary(seeds1)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EVE (ellipsoidal, equal volume and orientation) model with 3 components: 
## 
##  log-likelihood   n df      BIC       ICL
##       -797.2031 210 40 -1808.29 -1819.132
## 
## Clustering table:
##  1  2  3 
## 61 73 76
plot(seeds1)

# On the PC scores the selected model has 3 components, matching the 3 true
# varieties (column 8), with a high adjusted Rand index.
adjustedRandIndex(as.numeric(data[,8]),seeds1$classification)
## [1] 0.8769483